clear 
est clear


**********************************************************************
* Prep: baseline data
**********************************************************************

*** Prep and check baseline control vars *************************************** 
*
* Load the panel. -clear- is the documented option of -use- for discarding the
* data in memory, and the path is written as ${data}/ with a forward slash so it
* matches the other file references in this script and works on any OS.
use "${data}/SMU_CleanData_Panel_Public", clear
sort cr_id location_branch location_club round

*** keep baseline only (round 0)
keep if round==0 

*** Age 
su CR_age, d
* create age var called age for convenience;
* capture so the script does not abort when no age variable is present
capture drop age 
gen age = CR_age 

*** ever pregnant 
fre s_eversex
fre f_everpreg
tab f_everpreg s_eversex, m 
* ever pregnant is zero if never pregnant or never had sex, one if ever pregnant;
* it stays missing when neither condition can be established
gen everpregnant = 0 		if f_everpreg==0 | s_eversex==0
replace everpregnant = 1 	if f_everpreg==1  
* original note: missing in 51 cases, because the sex or pregnancy questions are missing 
tab everpregnant
mdesc everpregnant

*** currently married
*	count "as if" married as married
fre ms_currentlymarried
* =1 for codes 1-6, =0 for other non-negative codes, missing for negative or missing codes
gen evermarried = inlist(ms_currentlymarried,1,2,3,4,5,6) if !missing(ms_currentlymarried) & ms_currentlymarried>=0

*** list of controls, summarised and checked for missing values
local controls age everpregnant evermarried PPI_score phq8_score
su `controls'
mdesc `controls' 
 
*** Impute missing controls: median fill plus a missingness flag
foreach ctrl of varlist everpregnant evermarried {
	display "var: `ctrl'"
	* flag the rows where the control is missing, before it is filled in
	generate mis_`ctrl' = missing(`ctrl')
	tabulate mis_`ctrl'
	* fill the flagged rows with the median of the observed values
	quietly summarize `ctrl', detail
	replace `ctrl' = r(p50) if mis_`ctrl'==1 
	* cross-tabulate the filled variable against its flag as a check
	tabulate `ctrl' mis_`ctrl'
}

*** Final control list for the baseline file
*	phq8 is left out here because it comes from the outcome data later on
local controls_ age everpregnant mis_everpregnant evermarried mis_evermarried PPI_score 
* add the suffix _b to every control and collect the new names in local controls
local controls = " "
foreach ctrl of local controls_ {
	rename `ctrl' `ctrl'_b
	local controls = " `controls' `ctrl'_b"
}
display "`controls'"


*** Attach the Phase II sampling weights
* The weights file does not cover every CR (original note: 10 are absent),
* which is why unmatched master records are tolerated via assert(match master).
merge 1:1 cr_id using "${data}/PhaseII_status_weights.dta", keepusing(sample_weight) assert(match master) nogenerate
tabulate sample_weight

* One weight per phase; it is 1 where the matching att_* flag is 0 and missing otherwise.
* NOTE(review): att_mid and att_end look like abbreviations of att_midline and
* att_endline, which are used further down in this script - confirm.
generate w1 = 1 if att_rr==0 
generate w2 = 1 if att_mid==0
generate w3 = 1 if att_end==0
* the third weight takes the sampling weight wherever one was merged in
replace w3 = sample_weight if att_end==0 & !missing(sample_weight)
drop sample_weight
* fix sample weight error: a weight of exactly 2 is reset to 1
replace w3 = 1 if w3 == 2

* NOTE(review): the label on w2 says BL although w2 is built from att_mid - confirm wording.
label variable w1 "Weights for RR, all equal to 1"
label variable w2 "Weights for BL, all equal to 1"
label variable w3 "Weights for EL, PhaseII selection probabilities"



*** Reduce to the id, the renamed controls and the weights
keep cr_id `controls' w?
* subtract the sample mean from every baseline control (flags included)
foreach ctrl of local controls {
	quietly summarize `ctrl', meanonly
	replace `ctrl'=`ctrl'-r(mean)
} 
* variable labels for the centered controls
label variable PPI_score_b 			"PPI Score at BL, centered"
label variable age_b 				"Age at BL, centered"
label variable everpregnant_b 		"=1 if Ever pregnant at BL, centered"
label variable evermarried_b 		"=1 if Ever married at BL, centered"
label variable mis_everpregnant_b 	"=1 if missing Ever pregnant at BL, centered"
label variable mis_evermarried_b		"=1 if missing Ever married at BL, centered"


*** Store the baseline controls in a tempfile for the merge at the end of the script
tempfile controls_baseline 
save `controls_baseline', replace

 
 
 

 
 


*** Prep and check mental health outcome vars **********************************

* Reload the full panel. -clear- is the documented option of -use-, and the
* path uses ${data}/ like the other file references in this script.
use "${data}/SMU_CleanData_Panel_Public", clear
sort cr_id location_branch location_club round

* baseline rows are kept here (the drop below is deliberately disabled)
*drop if round==0

* first, get a sense of how many obs we have in each round;
* follow-up rows of CRs flagged in any of the three att_* variables are removed
drop if (att_rr==1 | att_midline==1 | att_endline==1) & round>0
label define round 0 "Baseline" 1 "Rapid Resurvey" 2 "12 Months" 3 "24 Months"
label val round round 
tab round

***create a current age variable 
* baseline uses CR_age; other rounds use final_cr_age, with cr_age_final as
* the fallback in round 3 when final_cr_age is missing
gen age_current = final_cr_age
replace age_current = CR_age if round == 0
replace age_current = cr_age_final if missing(age_current) & round == 3 

*** PHQ-8 
* original note: n=3 missing from rapid resurvey, they have fairly incomplete surveys
bys round: su phq8_score
* phq minimal is missing in RR, so derive it from the score here (score<=4 is minimal)
tab round phq8_minimal
replace phq8_minimal = 0 if phq8_score>4 & !missing(phq8_score) & round==1
replace phq8_minimal = 1 if phq8_score<=4 & !missing(phq8_score) & round==1
table phq8_score round phq8_minimal 
* count number of non-missing phq items per row
local phq_vars phq8_interest phq8_sad phq8_asleep phq8_tired phq8_appetite phq8_worth phq8_concentration phq8_slowly
egen nm_phq=rownonmiss(`phq_vars') 
tab round nm_phq 
* check missings on the phq indicator and on the phq score;
* the full variable name phq8_minimal is spelled out so this does not depend
* on variable abbreviation being switched on
gen m_phq_min=missing(phq8_minimal)
gen m_phq_score=missing(phq8_score)
tab m_phq_min m_phq_score, m 
 
/*************
	SO, for the phq8 we are only missing 3 phq's from RR so we just drop those from the sample... 
*************/
drop if m_phq_min==1 & m_phq_score==1 
mdesc phq*

*** GHQ-12 
tab GHQ12_score round 
* lower-case names for convenience
rename (GHQ12_score GHQ12_minimal) (ghq12_score ghq12_minimal) 
* build the dummy for baseline: minimal means a score below 3
replace ghq12_minimal = 1 if ghq12_score<3 & !missing(ghq12_score) & round==0
replace ghq12_minimal = 0 if ghq12_score>=3 & !missing(ghq12_score) & round==0
table ghq12_score round ghq12_minimal 
* count number of non-missing ghq items per row
local ghq_vars mh_ghq12_concentrate mh_ghq12_sleep mh_ghq12_useful mh_ghq12_capable mh_ghq12_understrain mh_ghq12_difficulties mh_ghq12_daytoday mh_ghq12_problems mh_ghq12_unhappy mh_ghq12_confidence mh_ghq12_worthless mh_ghq12_happy
egen nm_ghq=rownonmiss(`ghq_vars') 
tab nm_ghq round 

* check missings,
*	there's an issue where we have the ghq_min but not the score for those with 
*	missing ghq items, so flag both;
*	the full variable name ghq12_minimal is spelled out so this does not
*	depend on variable abbreviation being switched on
gen m_ghq_min=missing(ghq12_minimal)
gen m_ghq_score=missing(ghq12_score)
*br mh_ghq* m_ghq_* if m_ghq_min==0 & m_ghq_score==1
* check why missing...
*	original note: they are phase-iii phone surveys with very few questions answered
tab m_ghq_score continue_1, m
tab m_ghq_score continue_2, m

/*************
	SO, for the ghq12 just keep those who answered the full ghq...
*************/
 
* quick cross check of the main outcomes
* 	original note: 23 GHQ's missing at baseline and 22 at Endline
tab m_ghq_score m_phq_score , m
table round m_ghq_score m_phq_score
* it's not the same CR's...
*br round cr_id if m_phq_score==0 & m_ghq_score==1
distinct cr_id if m_phq_score==0 & m_ghq_score==1
 
/*************
	We have 45 CR's with a phq but no GHQ, 23 at baseline and 22 at endline, do we drop them??? 
	- No, instead use the PHQ at baseline for all regressions (it's due to tracking QQ)
*************/

*** Self-Esteem: the score has to be built for endline, so the items are prepared for all rounds
tabulate rosb_score round 
* items that enter the score
local rosb_vars mh_rosb_satisfied mh_rosb_nogood mh_rosb_qualities mh_rosb_abletodo mh_rosb_notproud mh_rosb_useless mh_rosb_worth mh_rosb_respect mh_rosb_failure mh_rosb_attitude
* harmonise the item coding
foreach item of local rosb_vars {
	* round 3 uses a different coding: shift its non-negative values down by one
	replace `item'=`item'-1 if round==3 & `item'>=0
	* inspect the item by round
	tabulate `item' round, missing
	* negative codes become missing
	replace `item'=. if `item'<0 
} 
* number of answered items per row
egen nm=rownonmiss(`rosb_vars') 
tabulate nm 
* sum of the items, only for rows with all 10 items answered
egen rosb = rowtotal(`rosb_vars')  if nm==10
* compare with the score already in the data, round by round
bysort round: correlate rosb rosb_score
* fill the existing score in round 3 where it is missing and the rebuilt one is not
replace rosb_score = rosb if round==3 & missing(rosb_score) & !missing(rosb)
tabulate rosb_score round
* remove the helper variables
drop nm rosb

*** Resilience: same approach as above, the score is missing at endline
tabulate rscore round
* items that enter the score
local res_vars res_getalong res_education res_behave res_lookout res_knowalot res_hungry res_time res_talkfeel res_friends res_school res_familycare res_friendscare res_fairly res_chances res_safe res_useful res_celebrate
* inspect each item and blank out negative codes
foreach item of local res_vars {
	tabulate `item' round, missing
	replace `item'=. if `item'<0 
} 
* number of answered items per row
egen nm=rownonmiss(`res_vars') 
tabulate nm  
* sum of the items, only for rows with all 17 items answered
egen res = rowtotal(`res_vars') if nm==17
* compare with the score already in the data, round by round
bysort round: correlate res rscore 
* fill the existing score in round 3 where it is missing and the rebuilt one is not
replace rscore = res if round==3 & missing(rscore) & !missing(res)
tabulate rscore round
* remove the helpers and give the score its analysis name
drop nm res 
rename rscore res_score 

*** Locus of control: negative codes become missing, then rename
tabulate control round
replace control=. if control<0
rename control loc_score















 
*** Prep and check human capital outcome vars **********************************

*** School enrollment 
* check with the 'in session' var, for each round specifically
bys round: tab ed_insession ed_inschool, m
* all good in rounds 3, fix up the round 1 (RR):
* fill missing ed_inschool from the two enrolled variables, not-in-session first
tab ed_insession ed_enrolled_notinsess if round==1, m
tab ed_insession ed_enrolled_insess	   if round==1, m
replace ed_inschool = ed_enrolled_notinsess if round==1 & missing(ed_inschool) & !missing(ed_enrolled_notinsess)
replace ed_inschool = ed_enrolled_insess if round==1 & missing(ed_inschool) & !missing(ed_enrolled_insess)
* baseline is a mess, after some digging this fix seems to make sense (should look at cleaning do files to confirm):
*	highest level known and no level attended recorded  -> not in school
*	highest level unknown and a level attended recorded -> in school
* (the original repeated the second replace twice; the repeat changed nothing and was removed)
replace ed_inschool = 0 if missing(ed_inschool) & !missing(ed_highestlevel) &   missing(ed_levelattended_insess) &  missing(ed_levelattended_notinsess)
replace ed_inschool = 1 if missing(ed_inschool) &  missing(ed_highestlevel) & (!missing(ed_levelattended_insess) | !missing(ed_levelattended_notinsess))
* in round 2 (12 months), use whether they were in school when covid hit 
tab ed_lckd ed_covid if round==2, m 
* create a new enrolled variable: ed_inschool in rounds 0, 1, 3 and ed_covid in round 2
gen 	ed_enrolled = ed_inschool if inlist(round,0,1,3)
replace ed_enrolled = ed_covid if inlist(round,2)
fre ed_inschool
* for those who have aged out of school but completed their secondary education, set as inschool == 1
* NOTE(review): this runs after ed_enrolled was generated, and ed_inschool is not
* among the variables kept for the analysis file later in this script, so it
* does not change ed_enrolled - confirm whether it was meant to.
replace ed_inschool = 1 if ed_highest_qualification >= 3 & !missing(ed_highest_qualification)


 
*** Competencies
* existing score by round
tabulate comp_score round, missing
* build the literacy flag for round 3 from the two illiterate_* variables
* (0 on the source variable means literate); the second source is applied last
foreach src in illiterate_adultcr illiterate_nonemanccr {
	replace com_literacy = 1 if `src' == 0 & !missing(`src') & round == 3 
	replace com_literacy = 0 if `src' == 1 & !missing(`src') & round == 3 
}
* mark the scenario answers for the 24 months round:
* correct needs the keyed answer and com_literacy==1; a wrong answer or com_literacy==0 scores 0
local key_a 2900
local key_b 3800
local key_c 3000
foreach s in a b c {
	* answers recorded where the scenario is already marked correct, by round
	bysort round: tabulate com_scenario`s'_answer if sc`s'_ans==1 
	replace sc`s'_ans = 0 if  ((com_scenario`s'_answer != `key_`s'' & com_literacy==1) | com_literacy==0) & round == 3
	replace sc`s'_ans = 1 if  (com_scenario`s'_answer == `key_`s''  & com_literacy==1) & round == 3
}
* inspect the three indicators by round
foreach s in a b c {
	tabulate round sc`s'_ans, missing
}
* the score is the number of correct scenarios
generate competencies = sca_ans+scb_ans+scc_ans 
* compare with the existing score, leaving out rounds 0 and 3
* (original note: baseline is on a 0-4 scale)
tabulate comp_score competencies if !inlist(round,0,3)


*** Self-Efficacy 
* original note: only available at mid and endline
tabulate se_score round, missing 


*** Incidence of pregnancy, i.e. a change in status (how many new pregnant CRs)
* gotta fix up everpregnant at baseline and RR first 
fre s_eversex
fre f_everpreg
bys round: tab f_everpreg s_eversex, m 
* rounds 0 and 1: zero if never pregnant or never had sex, one if ever pregnant;
* rounds 2 and 3: take f_everpreg as it is
gen everpregnant = 0 		if (f_everpreg==0 | s_eversex==0) & inlist(round,0,1)
replace everpregnant = 1 	if f_everpreg==1 & inlist(round,0,1)
replace everpregnant = f_everpreg if inlist(round,2,3)
* original note: missing in 51 cases, because the sex or pregnancy questions are missing 
tab round everpregnant, m
mdesc everpregnant
* now use panel structure to create the incidence 
* check, there are some inconsistent cases: flag rows where the status is lower
* than in the CR's previous row. The within-CR order is stated explicitly with
* (round) so the [_n-1] comparison does not depend on an earlier sort.
bysort cr_id (round): gen check=everpregnant[_n]<everpregnant[_n-1] & !missing(everpregnant[_n-1])
tab round check, m
rename check check_everpreg
* now the change: spread the baseline status to all rows of the CR, then flag
* rows that are pregnant now but were not at baseline
gen xpreg_b = everpregnant if round == 0
bys cr_id: egen preg_b = max(xpreg_b)
gen newlypregnant = everpregnant == 1 & preg_b == 0 if !missing(everpregnant) & !missing(preg_b)
 
*** Teen pregnancies 
* age at pregnancy with negative codes set to missing
gen preg_fage = f_agepreg 
replace preg_fage =  . if f_agepreg<0
* now, create a teenage pregnancy dummy: age below 20, defined wherever
* everpregnant is known (a missing age gives 0)
gen teenpreg = preg_fage<20 if !missing(everpregnant)
* baseline value, spread to all rows of the CR
gen xteenpreg_b = teenpreg if round == 0
bys cr_id: egen teenpreg_b = max(xteenpreg_b)
gen newlyteenpreg = teenpreg == 1 & teenpreg_b == 0 if !missing(everpregnant) & !missing(teenpreg_b)

 

*** incidence of marriage
*	count "as if" married as married
*	means divorced/separated/widowed are not married... 
tab round ms_currentlymarried
fre ms_currentlymarried
* create evermarried: =1 for codes 1-6, =0 for other non-negative codes,
* missing for negative or missing codes
gen evermarried = inlist(ms_currentlymarried,1,2,3,4,5,6) if !missing(ms_currentlymarried) & ms_currentlymarried>=0
tab ms_currentlymarried evermarried, m
bys round: tab evermarried ms_currentlymarried, m 
* now use panel structure to create the incidence 
* check, there are some inconsistent cases: flag rows where the status is lower
* than in the CR's previous row. The within-CR order is stated explicitly with
* (round) so the [_n-1] comparison does not depend on an earlier sort.
bysort cr_id (round): gen check=evermarried[_n]<evermarried[_n-1] & !missing(evermarried[_n-1])
tab round check, m
rename check check_evermarr
* now the change: spread the baseline status to all rows of the CR, then flag
* rows that are married now but were not at baseline
gen xmarr_b = evermarried if round == 0
bys cr_id: egen marr_b = max(xmarr_b)
gen newlymarried = evermarried == 1 & marr_b == 0 if !missing(evermarried) & !missing(marr_b)

*** Child marriage 
* age at marriage with negative codes set to missing
gen marriage_fage = ms_agemarried
replace marriage_fage =  . if marriage_fage<0
* now, create an underage marriage dummy: age below 18, defined wherever
* evermarried is known (a missing age gives 0)
gen childmarr = marriage_fage<18 if !missing(evermarried)
* baseline value, spread to all rows of the CR
gen xchildmarr_b = childmarr if round == 0
bys cr_id: egen childmarr_b = max(xchildmarr_b)
gen newlychildmarr = childmarr == 1 & childmarr_b == 0 if !missing(evermarried) & !missing(childmarr_b)



*** Risky sex
fre s_eversex
fre s_lastcondom
* rounds 0 and 1: built from s_lastcondom, and set to 0 when s_eversex is 0
* NOTE(review): the variable equals 1 when s_lastcondom==1; whether that lines
* up with the coding of risky_sex used in rounds 2 and 3 cannot be seen here - confirm.
generate riskysex = 0 		if inlist(round,0,1) & (s_lastcondom==0 | s_eversex==0)
replace riskysex = 1 	if inlist(round,0,1) & (s_lastcondom==1)
* rounds 2 and 3: copy the existing risky_sex variable
replace riskysex = risky_sex if inlist(round,2,3)
* some values stay missing when the underlying questions are missing
bysort round: tabulate riskysex risky_sex , missing
tabulate round riskysex, missing

* overview of the outcomes built so far, by round
bysort round: summarize ed_enrolled competencies se_score everpregnant evermarried riskysex


*** Desired fertility 
tabulate f_nrchild round, missing
tabulate f_nrchild //, m
* some values are extreme: cap the variable at its 99th percentile
* NOTE(review): the percentile is taken over all non-missing values, including
* any negative codes that are only filtered out in the next step - confirm this is intended.
summarize f_nrchild, detail
replace f_nrchild = r(p99) if !missing(f_nrchild) & f_nrchild>r(p99)
* original note: only 31 changes across all waves
* the outcome keeps non-negative values only
generate desired_fert = f_nrchild if !missing(f_nrchild) & f_nrchild>=0
* plot per round, to check the weird numbers, not much really... 
*local r 2
*tw hist desired_fert if randomization==0 & round==`r', discrete lcolor(orange) fcolor(orange%0.5) || ///
*	hist desired_fert if randomization==1 & round==`r', discrete lcolor(blue) fcolor(blue%0.5) || ///
*	hist desired_fert if randomization==2 & round==`r', discrete lcolor(red) fcolor(red%0.5) , ///
*	legend(rows(1))
	 
*** Time preferences: hyp_110k with negative codes left missing
tabulate hyp_110k round, missing
generate time_pref = hyp_110k if hyp_110k>=0 & !missing(hyp_110k)
tabulate time_pref hyp_110k, missing
tabulate round time_pref, missing


*** Expectations of paid work: pw_25yo divided by 10, negative codes left missing
tabulate pw_25yo round, missing
generate paid_work = (pw_25yo/10) if pw_25yo>=0 & !missing(pw_25yo)
tabulate paid_work round, missing


*** Life expectancy: f_alive40 divided by 10, negative codes left missing
tabulate f_alive40 round, missing
generate life_exp = (f_alive40/10) if f_alive40>=0 & !missing(f_alive40)
tabulate life_exp round , missing

 
*** Educational aspirations 
tabulate ed_qualgoal round , missing 
tabulate ed_lvlgoal round , missing
tabulate ed_lvlgoal ed_qualgoal  , missing
fre ed_qualgoal ed_lvlgoal
* indicator for ed_qualgoal of 4 or above, defined for non-negative codes only
generate ed_aspiration = ed_qualgoal>=4 if ed_qualgoal>=0 & !missing(ed_qualgoal)
tabulate round ed_aspiration , missing


*** Time to pregnancy 
tabulate f_hopedyrs round, missing
fre f_hopedyrs
* =1 for codes 0-3, =0 for codes of 4 or above and for code -98;
* other negative codes and missing values stay missing
generate wantpreg = (f_hopedyrs<4 & f_hopedyrs!=-98) if !missing(f_hopedyrs) & (f_hopedyrs>=0 | f_hopedyrs==-98)
* anyone who has ever been pregnant counts as 1
replace wantpreg = 1 if everpregnant==1
bysort round: tabulate f_hopedyrs wantpreg, missing


*** Time to marriage 
tabulate ms_yrshoped round, missing
fre ms_yrshoped
* same construction as for pregnancy, using ms_yrshoped
generate wantmarr = (ms_yrshoped<4 & ms_yrshoped!=-98) if !missing(ms_yrshoped) & (ms_yrshoped>=0 | ms_yrshoped==-98)
* anyone who has ever been married counts as 1
replace wantmarr = 1 if evermarried==1
bysort round: tabulate ms_yrshoped wantmarr, missing



 







 
  


*** Clean up and save **********************************************************

* variable labels: mental health outcomes
label variable phq8_score "PHQ-8"
label variable phq8_minimal "PHQ-8<=4"
label variable ghq12_score "GHQ-12"
label variable ghq12_minimal "GHQ-12<3"
label variable rosb_score "Self-Esteem"
label variable res_score "Resilience"
label variable loc_score "Locus of Control"
* variable labels: human capital outcomes
label variable ed_enrolled "Enrolled in school"
label variable competencies "Competencies"
label variable se_score "Self-Efficacy"
label variable everpregnant "Ever pregnant"
label variable newlypregnant "Pregnant since baseline"
label variable newlyteenpreg "Teenage pregnancy since baseline"
label variable newlychildmarr "Child marriage since baseline"
label variable evermarried "Ever married"
label variable newlymarried "Married since baseline"
label variable riskysex "Risky sex"
* variable labels: preferences and expectations
label variable desired_fert "Desired fertility"
label variable time_pref "Time pref. (patient)"
label variable paid_work "Pr. paid work by 25yo"
label variable life_exp "Pr. alive at age 40"
label variable ed_aspiration "Aspires to college/university"
label variable wantmarr "Marriage, within 10 yrs"
label variable wantpreg "Pregnancy, within 10 yrs"
* variable labels: other
label variable age_current "Age in current survey, after corrections"



* shorter names for the club id and the two minimal indicators
rename location_club club
rename (phq8_minimal ghq12_minimal) (phq8_min ghq12_min)
* variable lists used for the keep, the label loop and the reshape
local outcomes phq8_score phq8_min ghq12_score ghq12_min rosb_score res_score loc_score ed_enrolled competencies se_score everpregnant newlypregnant newlyteenpreg evermarried newlymarried newlychildmarr riskysex desired_fert time_pref paid_work life_exp ed_aspiration wantpreg wantmarr
local othervars age_current 

*** Reduce to the analysis variables
keep cr_id round block club randomization `outcomes' `othervars'
* remember each variable label in a local vlab_<name> so it can be re-applied after the reshape
foreach var in  `outcomes' `othervars' {
	local vlab_`var': variable label `var'
} 

*** reshape wide: one row per CR, one column per outcome and round
reshape wide `outcomes' `othervars' , i(cr_id block club) j(round)
*** 

* relabel nicely: put the saved long-format label back on each round-specific variable
foreach v in  `outcomes' `othervars' {
	forvalues r=0/3 {
		label var `v'`r' "`vlab_`v''"
	}
}
* drop if all missing 
* The check counts non-missing values directly instead of generating a scratch
* variable, so it cannot collide with an existing variable name. The _N>0 guard
* means nothing is dropped from an empty dataset.
ds * 
foreach v in `r(varlist)' {
	qui count if !missing(`v')
	if r(N)==0 & _N>0 {
		di "drop: `v'"
		drop `v'
	}
}
* clean up and save
*	keep names as they are...  more convenient for loops probably later on
* merge in baseline controls though; every CR must match on both sides
merge 1:1 cr_id  using `controls_baseline', nogen assert(match)

*** treatment indicators 
* arm specific 
order randomization, after(club)
rename randomization treat 
label define treat 0"Control" 1"IPT-G" 2"IPT-G+"
label val treat treat
* any therapy arm versus control.
* In Stata a missing value satisfies treat!=0, so the indicator is restricted
* to rows with a known arm; rows with missing treat stay missing.
gen treat_ther = treat!=0 if !missing(treat)
label define treat_ther 0"Control" 1"IPT-G"
label val treat_ther treat_ther
label var treat_ther "Treatment, combined IPT-G"
* cash: arm 2 versus arm 1, missing for control and for rows with missing treat
gen treat_cash = treat==2 if treat!=0 & !missing(treat)
label define treat_cash 0"IPT-G Only" 1"IPT-G+"
label val treat_cash treat_cash
label var treat_cash "Treatment, IPT-G+"

* center the baseline outcomes, except for:
* phq8_minimal, loc_control, se_score, which we don't have at BL
* everpregnant evermarried we already have 
local outcomes_to_center phq8_score ghq12_score ghq12_min rosb_score res_score ed_enrolled competencies riskysex  wantpreg wantmarr
* for each listed outcome: subtract the mean of its round-0 column, rename that
* column to <name>_b and move it to the end of the dataset
foreach v of local outcomes_to_center {
	qui su `v'0, meanonly 
	replace `v'0 = `v'0 - r(mean) 
	rename `v'0 `v'_b
	order `v'_b, last
}
* labels 
label var phq8_score_b		"PHQ-8 Score at BL, centered"
label var ghq12_score_b 	"GHQ-12 Score at BL, centered"
label var ghq12_min_b		"=1 if GHQ-12<3 at BL, centered"
label var rosb_score_b 		"Self Esteem Score at BL, centered"
label var res_score_b 		"Resilience Score at BL, centered"
label var ed_enrolled_b 	"Enrolled at BL, centered"
label var competencies_b 	"Competencies at BL, centered"
label var riskysex_b		"Risky sex at BL, centered"
label var wantpreg_b		"Time to pregnancy at BL, centered"
label var wantmarr_b		"Time to marriage at BL, centered"
label var treat 			"Treatment"
* order: weights at the end, combined treatment indicator next to treat
order w1 w2 w3, last 
order treat_ther, after(treat)

* save the wide analysis file
save "${data}/Analysis_wide.dta", replace
